The first step is to load the necessary libraries:

library(readr) # Read data files    
library(dplyr) # Manipulation tasks (filtering, selecting, summarizing)     
library(tibble) # Ensures compatibility with tidyverse tools    
library(ggplot2) # Used for plotting
library(DESeq2)  # Perform DEA
library(BiocParallel) # Used for parallel computing
library(EDASeq) # Exploratory data analysis tools for RNA-Seq data
library(clusterProfiler) # Facilitates enrichment analysis
library(enrichplot) # Visualization tools for enrichment analysis
library(pheatmap)  # Correlation matrix plot
library(ggfortify) # PCA autoplot
library(corrplot)  # Plot correlation matrix

The packages to install come either from CRAN (The Comprehensive R Archive Network) or from Bioconductor.

The CRAN packages are straightforward and can be installed like this:

# Install CRAN packages
install.packages("readr")

Bioconductor packages are installed using the BiocManager package, which first we need to install:

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

Afterwards, it is possible to install the Bioconductor packages:

# Install Bioconductor packages
BiocManager::install("DESeq2")

EXERCISE 2

Import both the count matrix and the sample information in R, format both of them to be used for downstream analysis and plot the PCA and correlation matrix.

First we load the necessary libraries and the .txt files we need to complete the exercises.

# Load the metadata
metadata <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/sample_metadata.txt", header = TRUE, sep = "\t")
metadata
##                             FileName SampleName CellType   Status
## 1  MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1    MCL1.DG  luminal   virgin
## 2  MCL1.DH_BC2CTUACXX_CAGATC_L002_R1    MCL1.DH    basal   virgin
## 3  MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1    MCL1.DI    basal pregnant
## 4  MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1    MCL1.DJ    basal pregnant
## 5  MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1    MCL1.DK    basal  lactate
## 6  MCL1.DL_BC2CTUACXX_ATCACG_L002_R1    MCL1.DL    basal  lactate
## 7  MCL1.LA_BC2CTUACXX_GATCAG_L001_R1    MCL1.LA    basal   virgin
## 8  MCL1.LB_BC2CTUACXX_TGACCA_L001_R1    MCL1.LB  luminal   virgin
## 9  MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1    MCL1.LC  luminal pregnant
## 10 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1    MCL1.LD  luminal pregnant
## 11 MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1    MCL1.LE  luminal  lactate
## 12 MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1    MCL1.LF  luminal  lactate
# Load the counts file
counts <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/GSE60450_Lactation-GenewiseCounts.txt", header = TRUE, sep = "\t")
head(counts)
##   EntrezGeneID Length MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1
## 1       497097   3634                               438
## 2    100503874   3259                                 1
## 3    100038431   1634                                 0
## 4        19888   9747                                 1
## 5        20671   3130                               106
## 6        27395   4203                               309
##   MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1
## 1                               300                                65
## 2                                 0                                 1
## 3                                 0                                 0
## 4                                 1                                 0
## 5                               182                                82
## 6                               234                               337
##   MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1
## 1                               237                               354
## 2                                 1                                 0
## 3                                 0                                 0
## 4                                 0                                 0
## 5                               105                                43
## 6                               300                               290
##   MCL1.DL_BC2CTUACXX_ATCACG_L002_R1 MCL1.LA_BC2CTUACXX_GATCAG_L001_R1
## 1                               287                                 0
## 2                                 4                                 0
## 3                                 0                                 0
## 4                                 0                                10
## 5                                82                                16
## 6                               270                               560
##   MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1
## 1                                 0                                 0
## 2                                 0                                 0
## 3                                 0                                 0
## 4                                 3                                10
## 5                                25                                18
## 6                               464                               489
##   MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1
## 1                                 0                                 0
## 2                                 0                                 0
## 3                                 0                                 0
## 4                                 2                                 0
## 5                                 8                                 3
## 6                               328                               307
##   MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1
## 1                                 0
## 2                                 0
## 3                                 0
## 4                                 0
## 5                                10
## 6                               342

Let’s check if the metadata is aligned with the counts matrix:

counts_columns <- colnames(counts)[-c(1, 2)] # Exclude the first two columns that aren't samples
counts_columns
##  [1] "MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1" "MCL1.DH_BC2CTUACXX_CAGATC_L002_R1"
##  [3] "MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1" "MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1"
##  [5] "MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1" "MCL1.DL_BC2CTUACXX_ATCACG_L002_R1"
##  [7] "MCL1.LA_BC2CTUACXX_GATCAG_L001_R1" "MCL1.LB_BC2CTUACXX_TGACCA_L001_R1"
##  [9] "MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1" "MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1"
## [11] "MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1" "MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1"

Let’s confirm that the metadata is aligend with the counts matrix:

all(counts_columns == metadata$FileName) # Should return TRUE
## [1] TRUE

Correlation matrix

corMatrix <- cor(counts[-c(1, 2)], use="c")
# Keep only the sample name (first 7 digits) for better visualization
colnames(corMatrix) <- substr(colnames(corMatrix), 1, 7) 
rownames(corMatrix) <- substr(rownames(corMatrix), 1, 7)
head(corMatrix)
##           MCL1.DG   MCL1.DH   MCL1.DI   MCL1.DJ   MCL1.DK   MCL1.DL   MCL1.LA
## MCL1.DG 1.0000000 0.9771420 0.8418633 0.8204474 0.8108604 0.8575194 0.7324363
## MCL1.DH 0.9771420 1.0000000 0.8708108 0.8490237 0.8120959 0.8377299 0.7324962
## MCL1.DI 0.8418633 0.8708108 1.0000000 0.9909929 0.9479076 0.9317133 0.5684151
## MCL1.DJ 0.8204474 0.8490237 0.9909929 1.0000000 0.9662422 0.9446826 0.5290221
## MCL1.DK 0.8108604 0.8120959 0.9479076 0.9662422 1.0000000 0.9857808 0.4971564
## MCL1.DL 0.8575194 0.8377299 0.9317133 0.9446826 0.9857808 1.0000000 0.5346671
##           MCL1.LB   MCL1.LC   MCL1.LD    MCL1.LE    MCL1.LF
## MCL1.DG 0.7377906 0.2367517 0.2135546 0.05760311 0.06287370
## MCL1.DH 0.7443741 0.2457856 0.2217865 0.05957578 0.06436077
## MCL1.DI 0.5822673 0.2794939 0.2609729 0.13185247 0.13542179
## MCL1.DJ 0.5415909 0.2331733 0.2163069 0.09410234 0.09737514
## MCL1.DK 0.5061827 0.2139352 0.2003178 0.09435293 0.09756887
## MCL1.DL 0.5428115 0.2202740 0.2066864 0.09735470 0.10133613
pheatmap(corMatrix)   

Now we will build the same correlation matrix but differenciating between the 3 status (virgin, pregnant and lactate)

 metadata$FileName <- colnames(corMatrix)
# Ensure the row names of metadata match the column names of corMatrix

rownames(metadata) <- metadata$FileName # Set row names to the sample names
metadata <- metadata[, "Status", drop = FALSE] # Keep only the Status column
all(rownames(metadata) == colnames(corMatrix))
## [1] TRUE
library(pheatmap)

# Plot the correlation matrix with annotation
pheatmap(
  corMatrix,
  main = "Correlation Heatmap by Status",
  annotation_col = metadata, # Add Status annotation
  show_rownames = FALSE,     # Hide row names for readability
  fontsize_col = 8,         
  angle_col = 45             
)

The following correlation plot contains the correlation coefficients as white numbers:

corrplot(corMatrix, order = 'hclust', 
         addrect = 2, addCoef.col = 'white', 
         number.cex = 0.7) 

It is possible to observe higher correlations among samples from the same status.

PCA

Before obtaining the PCA, it is necessary to transpose the counts matrix.

transposed_matrix <- t(counts[-c(1, 2)]) # Transpose the matrix

transposed_matrix <- log2(transposed_matrix + 1) # Transform the counts to log2 scale 

pcaResults <- prcomp(transposed_matrix) # Obtain PCA 
#Plot PCA results
autoplot(pcaResults, data = metadata, colour = 'Status')+
    ggtitle("PCA plot")+
    theme(plot.title = element_text(hjust = 0.5))

summary(pcaResults)
## Importance of components:
##                             PC1     PC2     PC3      PC4      PC5      PC6
## Standard deviation     142.1211 74.4452 42.1418 39.72937 29.06842 20.17033
## Proportion of Variance   0.6323  0.1735  0.0556  0.04941  0.02645  0.01274
## Cumulative Proportion    0.6323  0.8058  0.8614  0.91086  0.93731  0.95005
##                             PC7      PC8      PC9     PC10     PC11      PC12
## Standard deviation     19.01924 18.42876 17.60980 17.26812 16.90820 5.188e-13
## Proportion of Variance  0.01132  0.01063  0.00971  0.00934  0.00895 0.000e+00
## Cumulative Proportion   0.96137  0.97201  0.98171  0.99105  1.00000 1.000e+00

EXERCISE 3

Perform a differential expression analysis (DEA) between virgin and pregnant mice, assuming as reference samples from virgin. Depict a volcano plot showing gene symbol and distinguishing UP- and DOWN-regulated genes.

1. Prepare the data

First of all we filter the metadata to keep only those samples that correspond to virgin and pregnant mice.

metadata <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/sample_metadata.txt", header = TRUE, sep = "\t")
subSamplInfo <- metadata[metadata$Status %in% c("virgin", "pregnant"), ]

We get the file names from the sub sampled metadata and we use them to obtain the columns from the counts matrix that match with that row names. By doing this, we are filtering the counts matrix to keep only samples from virgin and pregnant mice too.

# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)] 

# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName

# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID  # Set EntrezGeneID as row names
head(subCOUNTS, n=3)
##           MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 MCL1.DH_BC2CTUACXX_CAGATC_L002_R1
## 497097                                  438                               300
## 100503874                                 1                                 0
## 100038431                                 0                                 0
##           MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1
## 497097                                   65                               237
## 100503874                                 1                                 1
## 100038431                                 0                                 0
##           MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 MCL1.LB_BC2CTUACXX_TGACCA_L001_R1
## 497097                                    0                                 0
## 100503874                                 0                                 0
## 100038431                                 0                                 0
##           MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1
## 497097                                    0                                 0
## 100503874                                 0                                 0
## 100038431                                 0                                 0

Before constructing the DESeqDataset object, wee need to ensure that the row names from the metadata (subsampl_files) match and are in the same order as the column names from subCOUNTS ( the filtered counts matrix).

all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE

Also, we ensure that the Status, which is the column differenciating between virgin and pregnant mice, is treated as a factor.

subSamplInfo$Status <- as.factor(subSamplInfo$Status)

2. Construct DESeqDataset object

# construct object
dds <- DESeqDataSetFromMatrix(
  countData = subCOUNTS, 
  colData = subSamplInfo, 
  design = ~ Status
)
print(dds)
## class: DESeqDataSet 
## dim: 27179 8 
## metadata(1): version
## assays(1): counts
## rownames(27179): 497097 100503874 ... 100861691 100504472
## rowData names(0):
## colnames(8): MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1
##   MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 ...
##   MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1
## colData names(4): FileName SampleName CellType Status

Certain functions can be used to access this information separately: rownames(dds), colnames(dds), counts(dds) and colData(dds).

colData(dds) # Example on how to access the information of the dds object
## DataFrame with 8 rows and 4 columns
##                                                 FileName  SampleName
##                                              <character> <character>
## MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 MCL1.DG_BC2CTUACXX_A..     MCL1.DG
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 MCL1.DH_BC2CTUACXX_C..     MCL1.DH
## MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 MCL1.DI_BC2CTUACXX_A..     MCL1.DI
## MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 MCL1.DJ_BC2CTUACXX_C..     MCL1.DJ
## MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 MCL1.LA_BC2CTUACXX_G..     MCL1.LA
## MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 MCL1.LB_BC2CTUACXX_T..     MCL1.LB
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LC_BC2CTUACXX_G..     MCL1.LC
## MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 MCL1.LD_BC2CTUACXX_G..     MCL1.LD
##                                      CellType   Status
##                                   <character> <factor>
## MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1     luminal virgin  
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1       basal virgin  
## MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1       basal pregnant
## MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1       basal pregnant
## MCL1.LA_BC2CTUACXX_GATCAG_L001_R1       basal virgin  
## MCL1.LB_BC2CTUACXX_TGACCA_L001_R1     luminal virgin  
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1     luminal pregnant
## MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1     luminal pregnant

3. Pre-filtering

Next, we will remove genes that have almost no information in any sample.

dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]

# We set the factor level to virgin mice
dds$Status <- relevel(dds$Status , ref = "virgin")

4. Run DESeq

library(BiocParallel)

param <- MulticoreParam()

register(param)
dds <- DESeq(dds, parallel = T)
DEresults<- results(dds)
DEresults <- DEresults[order(DEresults$pvalue),] #sort results by increasing p-value
DEresults
## log2 fold change (MLE): Status pregnant vs virgin 
## Wald test p-value: Status pregnant vs virgin 
## DataFrame with 19220 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## 77097   3064.078      -1.980240 0.1532796  -12.9191 3.51065e-38 5.95968e-34
## 14758   1554.425       1.464270 0.1248113   11.7319 8.74983e-32 7.42686e-28
## 20656   2783.784       0.802968 0.0687741   11.6754 1.70172e-31 9.62949e-28
## 101214  3224.373      -0.736428 0.0693145  -10.6244 2.29373e-26 9.73460e-23
## 72157    944.784       0.765255 0.0813538    9.4065 5.12917e-21 1.74146e-17
## ...          ...            ...       ...       ...         ...         ...
## 246133  19.94624       -3.75980   2.44800  -1.53586          NA          NA
## 13184    4.26773       -4.63749   3.39175  -1.36729          NA          NA
## 76507   21.67145       -2.00418   1.10862  -1.80782          NA          NA
## 23993   11.32434       -1.74899   1.68251  -1.03951          NA          NA
## 19259   26.89530       -7.32684   2.77443  -2.64084          NA          NA

5. Diagnostic plots

This MA plot is useful to observe if the data normalization worked well. Most points are expected to be on the horizontal 0 line, since most genes are not expected to be differentially expressed.

# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot virgin-pregnant")

For the p-value distribution, we expect to see a peak around low p-values and a uniform distribution at P-values above 0.1.

# P-value distribution

ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + 
  geom_histogram(bins = 100)+
  ggtitle("p-value distribution virgin-pregnant") +
  theme(plot.title = element_text(hjust = 0.5))

# PCA plot

rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') + 
  ylim(-50, 50) + theme_bw()+
  ggtitle("PCA plot virgin-pregnant")+
  theme(plot.title = element_text(hjust = 0.5))
## using ntop=500 top features by variance

### 6. Visualization

library(org.Mm.eg.db) # Genome wide annotation for mice 
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## 
# Save the DEG results in a dataframe
DEG.vir.preg <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue, 
                           DEresults$padj, rownames(DEresults)) %>%   `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
# Map the EntrezGeneID column to a reference mice database

DEG.vir.preg$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.vir.preg$EntrezGeneID,column="SYMBOL", 
                              keytype="ENTREZID", multiVals="first") 
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.vir.preg)
##   log2FoldChange      lfcSE       pvalue         padj EntrezGeneID GenSymbl
## 1     -1.9802400 0.15327961 3.510653e-38 5.959684e-34        77097    Tanc2
## 2      1.4642702 0.12481127 8.749830e-32 7.426856e-28        14758    Gpm6b
## 3      0.8029683 0.06877411 1.701724e-31 9.629491e-28        20656     Sod2
## 4     -0.7364281 0.06931450 2.293733e-26 9.734604e-23       101214    Tra2a
## 5      0.7652548 0.08135380 5.129172e-21 1.741456e-17        72157     Pgm1
## 6      2.1195380 0.24194413 1.944831e-18 5.502576e-15       433022   Plcxd2

Volcano plot

The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners. We are going to add a column to the data frame (named diffexpressed) to specify if they are UP- or DOWN- regulated (which depends if log2FoldChange is positive or negative).

# Set the thesholds for being up or down regulated
F2CLim <- 2
PadjLim <- 0.05

A log2 fold change of 2 corresponds to a 4-fold increase in gene expression, which can be considered a substantial change that reflects meaningful biological shifts. Moreover, a p-value of 0.05 is a common accepted significance level. If the changes are below the 0.05 threshold, it means they are statistically significant and not due to random chance.

DEG.vir.preg$diffexpressed <- "NO" # First we categorized all genes into not differencially expressed

# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP" 
DEG.vir.preg$diffexpressed[DEG.vir.preg$log2FoldChange > F2CLim  & DEG.vir.preg$padj < PadjLim ] <- "UP"

# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.vir.preg$diffexpressed[DEG.vir.preg$log2FoldChange < -F2CLim & DEG.vir.preg$padj < PadjLim] <- "DOWN"

Now we perform a Volcano plot and color-code it based on if the genes are up or down regulated:

p <- ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) + 
  geom_point() + theme_minimal()


p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red")

## Change point color 

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
          ggtitle("Volcano plot virgin-pregnant") +
          theme(plot.title = element_text(hjust = 0.5))

p3

Next, the same Volcano plot but with the names of the genes next to each point will be assessed:

# Create a new column "delabel" that contains the name of genes differentially expressed (NA in case they are not)
DEG.vir.preg$delabel <- NA
DEG.vir.preg$delabel[DEG.vir.preg$diffexpressed != "NO"] <- DEG.vir.preg$GenSymbl[DEG.vir.preg$diffexpressed != "NO"]

#Organize the labels using the "ggrepel" package and the geom_text_repel() function

library(ggrepel)

ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
        geom_point() + 
        theme_minimal() +
        geom_text_repel() +
        scale_color_manual(values=c("blue", "black", "red")) +
        geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red")+
        ggtitle("Volcano plot virgin-pregnant") +
        theme(plot.title = element_text(hjust = 0.5))

We save the results:

UPR.vir.preg <- DEG.vir.preg[DEG.vir.preg$diffexpressed=="UP",]
DWR.vir.preg <- DEG.vir.preg[DEG.vir.preg$diffexpressed=="DOWN",]
head(UPR.vir.preg, n=3)
##    log2FoldChange     lfcSE       pvalue         padj EntrezGeneID GenSymbl
## 6        2.119538 0.2419441 1.944831e-18 5.502576e-15       433022   Plcxd2
## 16       2.010375 0.2673337 5.473635e-14 5.807526e-11       170706   Tmem37
## 20       7.700666 1.0743842 7.637067e-13 6.482342e-10        14077    Fabp3
##    diffexpressed delabel
## 6             UP  Plcxd2
## 16            UP  Tmem37
## 20            UP   Fabp3

EXERCISE 4

Perform enrichment analysis of both UP- and DOWN-regulated genes and represent results for each ontology graphicall.

First of all we retrieve the mouse genes from the reference library and we map them to our EntrezGeneID to ensure the IDs used in downstream analysis are valid and consistent with the genome annotation database.

# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id))  # Mouse universe genes
head(df)
##   gene_id      go_id Evidence Ontology
## 1   11287 GO:0007566      IGI       BP
## 2   11287 GO:0010466      IEA       BP
## 3   11298 GO:0006474      ISO       BP
## 4   11298 GO:0007623      IBA       BP
## 5   11298 GO:0007623      ISO       BP
## 6   11298 GO:0009416      IBA       BP

DOWN-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = DWR.vir.preg$EntrezGeneID,      
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID                                   Description
## GO:0050767       BP GO:0050767                    regulation of neurogenesis
## GO:0014009       BP GO:0014009                      glial cell proliferation
## GO:0014013       BP GO:0014013                     regulation of gliogenesis
## GO:0014015       BP GO:0014015            positive regulation of gliogenesis
## GO:0030856       BP GO:0030856 regulation of epithelial cell differentiation
## GO:1903034       BP GO:1903034            regulation of response to wounding
##            GeneRatio   BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0050767    19/184 494/28905 0.03846154       6.042015  9.047164 4.719877e-10
## GO:0014009     9/184  79/28905 0.11392405      17.896602 12.036738 1.978310e-09
## GO:0014013    10/184 143/28905 0.06993007      10.985482  9.581105 2.938305e-08
## GO:0014015     8/184  93/28905 0.08602151      13.513324  9.674211 1.486042e-07
## GO:0030856    10/184 173/28905 0.05780347       9.080485  8.532291 1.772410e-07
## GO:1903034    10/184 176/28905 0.05681818       8.925704  8.441547 2.080162e-07
##                p.adjust       qvalue
## GO:0050767 1.110587e-06 9.285736e-07
## GO:0014009 2.327482e-06 1.946033e-06
## GO:0014013 2.304611e-05 1.926910e-05
## GO:0014015 8.157702e-05 6.820742e-05
## GO:0030856 8.157702e-05 6.820742e-05
## GO:1903034 8.157702e-05 6.820742e-05
##                                                                                                                       geneID
## GO:0050767 Abcc8/Map1b/Id1/Cxcr4/Il33/Dpysl5/Hey2/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Hey1/Sema3b/Dscam/Wnt7b/Plag1/Trpc6/Ptn
## GO:0014009                                                               Abcc8/Il33/Areg/Cysltr1/Ntn1/Lepr/Cntnap2/Plag1/Ptn
## GO:0014013                                                     Abcc8/Cxcr4/Il33/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Plag1/Ptn
## GO:0014015                                                                 Cxcr4/Il33/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Plag1/Ptn
## GO:0030856                                                           Id1/Dmbt1/Prom1/Hey2/Eppk1/Fgfr3/Hey1/Cited1/Reg3g/Proc
## GO:1903034                                                              Tspan8/Vwf/Fgb/Cxcr4/Il33/Eppk1/Mmrn1/Reg3g/Proc/Ptn
##            Count
## GO:0050767    19
## GO:0014009     9
## GO:0014013    10
## GO:0014015     8
## GO:0030856    10
## GO:1903034    10
## Remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=3)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
##            ONTOLOGY         ID                                   Description
## GO:0050767       BP GO:0050767                    regulation of neurogenesis
## GO:0014009       BP GO:0014009                      glial cell proliferation
## GO:0030856       BP GO:0030856 regulation of epithelial cell differentiation
##            GeneRatio   BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0050767    19/184 494/28905 0.03846154       6.042015  9.047164 4.719877e-10
## GO:0014009     9/184  79/28905 0.11392405      17.896602 12.036738 1.978310e-09
## GO:0030856    10/184 173/28905 0.05780347       9.080485  8.532291 1.772410e-07
##                p.adjust       qvalue
## GO:0050767 1.110587e-06 9.285736e-07
## GO:0014009 2.327482e-06 1.946033e-06
## GO:0030856 8.157702e-05 6.820742e-05
##                                                                                                                       geneID
## GO:0050767 Abcc8/Map1b/Id1/Cxcr4/Il33/Dpysl5/Hey2/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Hey1/Sema3b/Dscam/Wnt7b/Plag1/Trpc6/Ptn
## GO:0014009                                                               Abcc8/Il33/Areg/Cysltr1/Ntn1/Lepr/Cntnap2/Plag1/Ptn
## GO:0030856                                                           Id1/Dmbt1/Prom1/Hey2/Eppk1/Fgfr3/Hey1/Cited1/Reg3g/Proc
##            Count
## GO:0050767    19
## GO:0014009     9
## GO:0030856    10

Visualization of DOWN-REGULATED genes

Dot plot

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for DOWN-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), 
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = DWR.vir.preg$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.

UpSet plot:

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
  ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated") +
  theme(plot.title = element_text(hjust = 0.5))

Tree plot:

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

UP-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = UPR.vir.preg$EntrezGeneID, # Specify up-regulated     
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID                               Description
## GO:0015718       BP GO:0015718             monocarboxylic acid transport
## GO:0015711       BP GO:0015711                   organic anion transport
## GO:0046942       BP GO:0046942                 carboxylic acid transport
## GO:0015849       BP GO:0015849                    organic acid transport
## GO:0015909       BP GO:0015909           long-chain fatty acid transport
## GO:1905954       BP GO:1905954 positive regulation of lipid localization
##            GeneRatio   BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0015718     10/96 182/28905 0.05494505      16.543613 12.142923 5.396754e-10
## GO:0015711     12/96 451/28905 0.02660754       8.011364  8.663026 3.443028e-08
## GO:0046942     11/96 374/28905 0.02941176       8.855699  8.826990 4.853190e-08
## GO:0015849     11/96 376/28905 0.02925532       8.808594  8.797798 5.122677e-08
## GO:0015909      5/96  68/28905 0.07352941      22.139246 10.074405 3.213174e-06
## GO:1905954      6/96 126/28905 0.04761905      14.337798  8.661279 4.094145e-06
##                p.adjust       qvalue
## GO:0015718 9.066547e-07 7.231651e-07
## GO:0015711 2.151524e-05 1.716097e-05
## GO:0046942 2.151524e-05 1.716097e-05
## GO:0015849 2.151524e-05 1.716097e-05
## GO:0015909 1.079626e-03 8.611306e-04
## GO:1905954 1.146361e-03 9.143590e-04
##                                                                                       geneID
## GO:0015718                 Fabp3/Slc26a7/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015711 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36/Slc52a2
## GO:0046942         Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015849         Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015909                                                       Fabp3/Drd4/Acsl1/Plin2/Cd36
## GO:1905954                                                   Fabp3/Lpl/Dab2/Acsl1/Plin2/Cd36
##            Count
## GO:0015718    10
## GO:0015711    12
## GO:0046942    11
## GO:0015849    11
## GO:0015909     5
## GO:1905954     6
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=2)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
##            ONTOLOGY         ID                   Description GeneRatio
## GO:0015718       BP GO:0015718 monocarboxylic acid transport     10/96
## GO:0015849       BP GO:0015849        organic acid transport     11/96
##              BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0015718 182/28905 0.05494505      16.543613 12.142923 5.396754e-10
## GO:0015849 376/28905 0.02925532       8.808594  8.797798 5.122677e-08
##                p.adjust       qvalue
## GO:0015718 9.066547e-07 7.231651e-07
## GO:0015849 2.151524e-05 1.716097e-05
##                                                                               geneID
## GO:0015718         Fabp3/Slc26a7/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015849 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
##            Count
## GO:0015718    10
## GO:0015849    11

Visualization of UP-REGULATED genes

Dot plot:

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for UP-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = UPR.vir.preg$log2FoldChange, circular = TRUE, colorEdge = TRUE, max.overlaps = 100)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.

UpSet plot:

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
            ggtitle("Intersection of Top 5 GO Terms for UP-regulated genes") +
          theme(plot.title = element_text(hjust = 0.5))

Tree plot:

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

EXERCISE 6

Perform the same analyses as tasks 3, 4, and 5, but compare virgin and lactating mice.

metadata <- read.delim("sample_metadata.txt", header = TRUE, sep = "\t") # Load metadata again
subSamplInfo <- metadata[metadata$Status %in% c("virgin", "lactate"), ] # Filter to virgin and lactating mice
# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)] 

# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName

# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID  # Set EntrezGeneID as row names

See if the rows of the metadata are aligned with the columns of the counts matrix:

all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE
subSamplInfo$Status <- as.factor(subSamplInfo$Status) # Status treated as a factor

DEA virgin-lactate

# construct a DESeqDataset object
dds <- DESeqDataSetFromMatrix(
  countData = subCOUNTS, 
  colData = subSamplInfo, 
  design = ~ Status
)
# Pre-filtering
dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]

# We set the factor level to virgin mice
dds$Status <- relevel(dds$Status , ref = "virgin")
# Run DESeq

library(BiocParallel)

param <- MulticoreParam()
register(param)
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
DEresults<- results(dds)
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]
DEresults
## log2 fold change (MLE): Status lactate vs virgin 
## Wald test p-value: Status lactate vs virgin 
## DataFrame with 19125 rows and 6 columns
##           baseMean log2FoldChange     lfcSE         stat      pvalue
##          <numeric>      <numeric> <numeric>    <numeric>   <numeric>
## 59095      499.919       -3.74548  0.286908     -13.0546 5.98222e-39
## 67111      395.788        2.23399  0.171645      13.0152 1.00335e-38
## 170459     483.021        2.52916  0.200609      12.6074 1.92263e-36
## 77697      236.771        1.63310  0.148234      11.0170 3.16256e-28
## 93842      338.688       -2.43362  0.222782     -10.9238 8.87127e-28
## ...            ...            ...       ...          ...         ...
## 68867  3158.769336    1.06118e-04  0.909386  1.16692e-04    0.999907
## 268354    0.785158    2.18907e-04  2.644676  8.27729e-05    0.999934
## 666202    1.850182    6.53647e-05  1.083369  6.03347e-05    0.999952
## 76507    15.145366   -4.71345e+00  1.464829 -3.21774e+00          NA
## 19259    23.718074   -4.44241e+00  2.481774 -1.79001e+00          NA
##               padj
##          <numeric>
## 59095  8.47729e-35
## 67111  8.47729e-35
## 170459 1.08295e-32
## 77697  1.33602e-24
## 93842  2.99813e-24
## ...            ...
## 68867     0.999907
## 268354          NA
## 666202          NA
## 76507           NA
## 19259           NA

Let’s plot the diagnostic plots explained before:

# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot virgin-lactate")

# P-value distribution
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + 
  geom_histogram(bins = 100)+
  ggtitle("p-vale distribution virgin-lactate") +
  theme(plot.title = element_text(hjust = 0.5))

rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') + 
  ylim(-50, 50) + theme_bw()+
  ggtitle("PCA plot virgin-lactate")
## using ntop=500 top features by variance

  theme(plot.title = element_text(hjust = 0.5))
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Visualization:

library(org.Mm.eg.db) # Genome wide annotation for mice 
DEG.vir.lact <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue, 
                           DEresults$padj, rownames(DEresults)) %>%   `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
DEG.vir.lact$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.vir.lact$EntrezGeneID,column="SYMBOL", 
                              keytype="ENTREZID", multiVals="first") 
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.vir.lact)
##   log2FoldChange      lfcSE       pvalue         padj EntrezGeneID GenSymbl
## 1     -3.7454754 0.28690844 5.982219e-39 8.477291e-35        59095    Fxyd6
## 2      2.2339923 0.17164543 1.003348e-38 8.477291e-35        67111     Naaa
## 3      2.5291587 0.20060927 1.922630e-36 1.082953e-32       170459   Stard4
## 4      1.6330980 0.14823370 3.162558e-28 1.336023e-24        77697     Mmab
## 5     -2.4336216 0.22278175 8.871268e-28 2.998134e-24        93842    Igsf9
## 6      0.9775204 0.09243579 3.886413e-26 9.978270e-23        74252    Armc1
  • Volcano plot:
DEG.vir.lact$diffexpressed <- "NO"
# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP" 
DEG.vir.lact$diffexpressed[DEG.vir.lact$log2FoldChange > F2CLim  & DEG.vir.lact$padj < PadjLim ] <- "UP"
# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.vir.lact$diffexpressed[DEG.vir.lact$log2FoldChange < -F2CLim & DEG.vir.lact$padj < PadjLim] <- "DOWN"
p <- ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) + 
  geom_point() + theme_minimal()

p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red")

## Change point color 

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
          ggtitle("Volcano plot virgin-lactate") +
          theme(plot.title = element_text(hjust = 0.5))

p3
## Warning: Removed 2244 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Plot with the name of genes beside the points

# Create a new column "delabel"
DEG.vir.lact$delabel <- NA
DEG.vir.lact$delabel[DEG.vir.lact$diffexpressed != "NO"] <- DEG.vir.lact$GenSymbl[DEG.vir.lact$diffexpressed != "NO"]

# Organize the labels using the "ggrepel" package and geom_text_repel() function

library(ggrepel)

ggplot(data=DEG.vir.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
        geom_point() + 
        theme_minimal() +
        geom_text_repel() +
        scale_color_manual(values=c("blue", "black", "red")) +
        geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red") +
        ggtitle("Volcano plot virgin-lactate") +
        theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 2227 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18456 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 629 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Save the results:

UPR.vir.lact <- DEG.vir.lact[DEG.vir.lact$diffexpressed=="UP",]
DWR.vir.lact <- DEG.vir.lact[DEG.vir.lact$diffexpressed=="DOWN",]

Enrichment analysis virgin-lactate

# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id))  # Mouse universe genes

DOWN-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = DWR.vir.lact$EntrezGeneID,   # Down regulated Virgin-lactating   
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID                        Description GeneRatio
## GO:0050767       BP GO:0050767         regulation of neurogenesis    24/308
## GO:0003205       BP GO:0003205        cardiac chamber development    12/308
## GO:0072001       BP GO:0072001           renal system development    16/308
## GO:0015850       BP GO:0015850 organic hydroxy compound transport    15/308
## GO:0010038       BP GO:0010038              response to metal ion    15/308
## GO:0046879       BP GO:0046879                  hormone secretion    17/308
##              BgRatio RichFactor FoldEnrichment   zScore       pvalue
## GO:0050767 494/28905 0.04858300       4.559388 8.281131 8.577351e-10
## GO:0003205 200/28905 0.06000000       5.630844 6.820096 1.778030e-06
## GO:0072001 359/28905 0.04456825       4.182614 6.297275 1.827804e-06
## GO:0015850 317/28905 0.04731861       4.440729 6.392670 1.849071e-06
## GO:0010038 318/28905 0.04716981       4.426764 6.376870 1.922227e-06
## GO:0046879 405/28905 0.04197531       3.939274 6.182145 1.963552e-06
##                p.adjust       qvalue
## GO:0050767 2.701008e-06 2.110931e-06
## GO:0003205 9.242665e-04 7.223463e-04
## GO:0072001 9.242665e-04 7.223463e-04
## GO:0015850 9.242665e-04 7.223463e-04
## GO:0010038 9.242665e-04 7.223463e-04
## GO:0046879 9.242665e-04 7.223463e-04
##                                                                                                                                                  geneID
## GO:0050767 Cit/Bmp2/Id1/Tnfrsf1b/Il33/Abcc8/Dpysl5/Heyl/Wnt7b/Trpc6/Rassf10/Cysltr1/Hey1/Hey2/Sema3b/Fgfr3/Myb/Bhlhe41/Fgf13/Cxcr4/Bex1/Lpar3/Ptn/Dscam
## GO:0003205                                                                              Stra6/Bmp2/Cpe/Heyl/Smad7/Hey1/Gata6/Hey2/Tnnt2/Cxcr4/Bmp5/Rbp4
## GO:0072001                                                        Stra6/Bmp2/Id3/Heyl/Wnt7b/Lgr5/Smad7/Hey1/Tacstd2/Gcnt4/Upk3a/Hc/Prom1/Cxcr4/Cfh/Rbp4
## GO:0015850                                               Stra6/Slc22a3/Stra6l/Kcnb1/Slco1a5/Nat8l/Lgals3/C1qtnf1/Abcc3/Myb/Slco1a4/Syt15/Syt8/Syt5/Rbp4
## GO:0010038                                                        Alox15/Abcc8/Cybrd1/Kcnb1/Crip1/Adcy7/Fgb/Tnnt2/Hamp/Chp2/Slc30a3/Fgg/Syt15/Syt8/Syt5
## GO:0046879                                                F2rl1/Il11/Lepr/Tnfsf11/Abcc8/Cpe/Kcnb1/C1qtnf1/Kcnj11/Fgb/Il1rn/Myb/Ildr2/Trh/Ffar4/Fgg/Rbp4
##            Count
## GO:0050767    24
## GO:0003205    12
## GO:0072001    16
## GO:0015850    15
## GO:0010038    15
## GO:0046879    17
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=2)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
##            ONTOLOGY         ID                 Description GeneRatio   BgRatio
## GO:0050767       BP GO:0050767  regulation of neurogenesis    24/308 494/28905
## GO:0003205       BP GO:0003205 cardiac chamber development    12/308 200/28905
##            RichFactor FoldEnrichment   zScore       pvalue     p.adjust
## GO:0050767   0.048583       4.559388 8.281131 8.577351e-10 2.701008e-06
## GO:0003205   0.060000       5.630844 6.820096 1.778030e-06 9.242665e-04
##                  qvalue
## GO:0050767 2.110931e-06
## GO:0003205 7.223463e-04
##                                                                                                                                                  geneID
## GO:0050767 Cit/Bmp2/Id1/Tnfrsf1b/Il33/Abcc8/Dpysl5/Heyl/Wnt7b/Trpc6/Rassf10/Cysltr1/Hey1/Hey2/Sema3b/Fgfr3/Myb/Bhlhe41/Fgf13/Cxcr4/Bex1/Lpar3/Ptn/Dscam
## GO:0003205                                                                              Stra6/Bmp2/Cpe/Heyl/Smad7/Hey1/Gata6/Hey2/Tnnt2/Cxcr4/Bmp5/Rbp4
##            Count
## GO:0050767    24
## GO:0003205    12

Visualization of DOWN-REGULATED genes

Dot plot:

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for DOWN-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = DWR.vir.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
            ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated genes") +
          theme(plot.title = element_text(hjust = 0.5))

Tree plot:

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

UP-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = UPR.vir.lact$EntrezGeneID,      
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID                              Description
## GO:0006631       BP GO:0006631             fatty acid metabolic process
## GO:0006633       BP GO:0006633          fatty acid biosynthetic process
## GO:0072330       BP GO:0072330 monocarboxylic acid biosynthetic process
## GO:0046394       BP GO:0046394     carboxylic acid biosynthetic process
## GO:0016053       BP GO:0016053        organic acid biosynthetic process
## GO:0006084       BP GO:0006084             acetyl-CoA metabolic process
##            GeneRatio   BgRatio RichFactor FoldEnrichment   zScore       pvalue
## GO:0006631    32/331 465/28905 0.06881720       6.009551 11.72109 5.956564e-16
## GO:0006633    20/331 162/28905 0.12345679      10.781023 13.43641 3.653093e-15
## GO:0072330    22/331 228/28905 0.09649123       8.426220 12.11646 2.730764e-14
## GO:0046394    24/331 325/28905 0.07384615       6.448710 10.63192 6.549035e-13
## GO:0016053    24/331 328/28905 0.07317073       6.389728 10.56581 7.981248e-13
## GO:0006084    10/331  36/28905 0.27777778      24.257301 15.02803 6.610351e-12
##                p.adjust       qvalue
## GO:0006631 1.829857e-12 1.522999e-12
## GO:0006633 5.611151e-12 4.670191e-12
## GO:0072330 2.796302e-11 2.327377e-11
## GO:0046394 4.903679e-10 4.081358e-10
## GO:0016053 4.903679e-10 4.081358e-10
## GO:0006084 3.384500e-09 2.816937e-09
##                                                                                                                                                                                                        geneID
## GO:0006631 Naaa/Gstm7/Fabp3/Lpl/Abcb11/Olah/Insig1/Acsl1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Ucp3/Pla2g15/Ehhadh/Acly/Acat2/Prkaa2/Ptgs1/Pla2g4f/Acat3/Ppargc1a/Elovl1/Scd3/Acot6/Lpin1
## GO:0006633                                                                              Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Acly/Prkaa2/Ptgs1/Pla2g4f/Elovl1/Scd3
## GO:0072330                                                                 Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Hoga1/Acly/Prkaa2/Ptgs1/Pla2g4f/Elovl1/Scd3
## GO:0046394                                                   Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Aass/Hoga1/Acly/Prkaa2/Ptgs1/Aldh18a1/Pla2g4f/Elovl1/Scd3
## GO:0016053                                                   Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Aass/Hoga1/Acly/Prkaa2/Ptgs1/Aldh18a1/Pla2g4f/Elovl1/Scd3
## GO:0006084                                                                                                                                               Acacb/Acss2/Pmvk/Fasn/Acaca/Mpc1/Dlat/Acly/Mpc2/Pdhb
##            Count
## GO:0006631    32
## GO:0006633    20
## GO:0072330    22
## GO:0046394    24
## GO:0016053    24
## GO:0006084    10
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)

Visualization of up-regulated genes

Dot plot:

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for UP-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), 
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = UPR.vir.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

UpSet plot:

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+

    ggtitle("Intersection of Top 5 GO Terms for UP-regulated genes") +
    theme(plot.title = element_text(hjust = 0.5))

Tree plot:

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

EXERCISE 7

Perform the same analyses as tasks 3, 4, and 5, but compare pregnant and lactating mice

metadata <- read.delim("sample_metadata.txt", header = TRUE, sep = "\t") # Load again the metadata
subSamplInfo <- metadata[metadata$Status %in% c("pregnant", "lactate"), ] # Filter for pregnant and lactate mice
# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)] 

# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName

# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID  # Set EntrezGeneID as row names

See if the rows of the metadata are aligned with the columns of the counts matrix:

all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE
subSamplInfo$Status <- as.factor(subSamplInfo$Status) # Status treated as a factor

DEA pregnant -lactate

# construct a DESeqDataset object
dds <- DESeqDataSetFromMatrix(
  countData = subCOUNTS, 
  colData = subSamplInfo, 
  design = ~ Status
)
# Pre-filtering

dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]

# We set the factor level to pregnant mice in this case
dds$Status <- relevel(dds$Status , ref = "pregnant")
# Run DESeq

library(BiocParallel)

param <- MulticoreParam()

register(param)
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
DEresults<- results(dds)
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]
DEresults
## log2 fold change (MLE): Status lactate vs pregnant 
## Wald test p-value: Status lactate vs pregnant 
## DataFrame with 18869 rows and 6 columns
##         baseMean log2FoldChange     lfcSE         stat      pvalue        padj
##        <numeric>      <numeric> <numeric>    <numeric>   <numeric>   <numeric>
## 67111    342.865       2.586276 0.2067976     12.50631 6.89518e-36 1.12426e-31
## 66234    773.961       2.231038 0.1921582     11.61042 3.64816e-31 2.97416e-27
## 69237   1319.425      -1.008621 0.0903078    -11.16870 5.80210e-29 3.15344e-25
## 67952   2047.203      -0.613893 0.0695262     -8.82966 1.04998e-18 4.27999e-15
## 67619    585.578      -1.101617 0.1284619     -8.57544 9.87098e-18 3.21893e-14
## ...          ...            ...       ...          ...         ...         ...
## 219150  395.1892    6.51393e-05  0.190371  0.000342170    0.999727    0.999736
## 17876   131.0867   -1.25242e-04  0.378766 -0.000330658    0.999736    0.999736
## 215866  114.1349   -4.85535e+00  1.910235 -2.541756729          NA          NA
## 18948    16.6089   -6.53002e+00  2.260910 -2.888224910          NA          NA
## 21943    60.0785   -6.95556e+00  2.167159 -3.209530294          NA          NA

Let’s plot the diagnostic plots:

# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot pregnant-lactate")

# P-value distribution
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) + 
  geom_histogram(bins = 100)+
  ggtitle("p-vale distribution pregnant-lactate") +
  theme(plot.title = element_text(hjust = 0.5))

rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') + 
  ylim(-50, 50) + theme_bw()+
  ggtitle("PCA plot pregnant-lactate")
## using ntop=500 top features by variance

  theme(plot.title = element_text(hjust = 0.5))
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

Visualization:

library(org.Mm.eg.db) # Genome wide annotation for mice 
DEG.preg.lact <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue, 
                           DEresults$padj, rownames(DEresults)) %>%   `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
DEG.preg.lact$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.preg.lact$EntrezGeneID,column="SYMBOL", 
                              keytype="ENTREZID", multiVals="first") # 
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.preg.lact)
##   log2FoldChange      lfcSE       pvalue         padj EntrezGeneID  GenSymbl
## 1      2.5862756 0.20679762 6.895182e-36 1.124259e-31        67111      Naaa
## 2      2.2310379 0.19215823 3.648158e-31 2.974161e-27        66234     Msmo1
## 3     -1.0086211 0.09030779 5.802102e-29 3.153442e-25        69237    Gtpbp4
## 4     -0.6138928 0.06952625 1.049982e-18 4.279991e-15        67952    Tomm20
## 5     -1.1016168 0.12846186 9.870978e-18 3.218926e-14        67619      Nob1
## 6      0.9283026 0.10902737 1.674554e-17 4.550601e-14        68728 Trp53inp2

Volcano plot:

DEG.preg.lact$diffexpressed <- "NO"
# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP" 
DEG.preg.lact$diffexpressed[DEG.preg.lact$log2FoldChange > F2CLim  & DEG.preg.lact$padj < PadjLim ] <- "UP"
# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.preg.lact$diffexpressed[DEG.preg.lact$log2FoldChange < -F2CLim & DEG.preg.lact$padj < PadjLim] <- "DOWN"
p <- ggplot(data=DEG.preg.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) + 
  geom_point() + theme_minimal()


p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red")

## Change point color 

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
          ggtitle("Volcano plot pregnant-lactate") +
          theme(plot.title = element_text(hjust = 0.5))

p3

# Write down the name of genes beside the points

# Create a new column "delabel"
DEG.preg.lact$delabel <- NA
DEG.preg.lact$delabel[DEG.preg.lact$diffexpressed != "NO"] <- DEG.preg.lact$GenSymbl[DEG.preg.lact$diffexpressed != "NO"]

# Organize the labels nicely using "ggrepel" package and geom_text_repel() function

library(ggrepel)

ggplot(data=DEG.preg.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
        geom_point() + 
        theme_minimal() +
        geom_text_repel() +
        scale_color_manual(values=c("blue", "black", "red")) +
        geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
        geom_hline(yintercept=-log10(PadjLim), col="red") +
        ggtitle("Volcano plot pregnant-lactate") +
        theme(plot.title = element_text(hjust = 0.5))

Save the results:

UPR.preg.lact <- DEG.preg.lact[DEG.preg.lact$diffexpressed=="UP",]
DWR.preg.lact <- DEG.preg.lact[DEG.preg.lact$diffexpressed=="DOWN",]

Enrichment analysis pregnant-lactate

# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id))  # Mouse universe genes

DOWN-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = DWR.preg.lact$EntrezGeneID, #down-regulated pregnant-lactate     
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID
## GO:0050727       BP GO:0050727
## GO:0042531       BP GO:0042531
## GO:0048145       BP GO:0048145
## GO:0030225       BP GO:0030225
## GO:0045649       BP GO:0045649
## GO:0042509       BP GO:0042509
##                                                                Description
## GO:0050727                             regulation of inflammatory response
## GO:0042531 positive regulation of tyrosine phosphorylation of STAT protein
## GO:0048145                          regulation of fibroblast proliferation
## GO:0030225                                      macrophage differentiation
## GO:0045649                        regulation of macrophage differentiation
## GO:0042509          regulation of tyrosine phosphorylation of STAT protein
##            GeneRatio   BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0050727      8/68 411/28905 0.01946472       8.273937  7.212273 5.477737e-06
## GO:0042531      4/68  56/28905 0.07142857      30.362395 10.680178 9.381058e-06
## GO:0048145      5/68 127/28905 0.03937008      16.735178  8.629818 1.263380e-05
## GO:0030225      4/68  62/28905 0.06451613      27.424099 10.114262 1.409768e-05
## GO:0045649      3/68  22/28905 0.13636364      57.964572 12.979364 1.857185e-05
## GO:0042509      4/68  71/28905 0.05633803      23.947804  9.401048 2.416721e-05
##               p.adjust      qvalue
## GO:0050727 0.004737862 0.003448715
## GO:0042531 0.004737862 0.003448715
## GO:0048145 0.004737862 0.003448715
## GO:0030225 0.004737862 0.003448715
## GO:0045649 0.004737862 0.003448715
## GO:0042509 0.004737862 0.003448715
##                                                             geneID Count
## GO:0050727 Ctss/Alox15/Serpine1/Smpdl3b/Csf1r/Osm/Pik3ap1/Dnase1l3     8
## GO:0042531                                      Lif/Csf1r/Osm/Il24     4
## GO:0048145                         Serpine1/Lif/Cd74/Rasgrf1/Ptprv     5
## GO:0030225                                     Csf1/C1qc/Lif/Csf1r     4
## GO:0045649                                           Csf1/C1qc/Lif     3
## GO:0042509                                      Lif/Csf1r/Osm/Il24     4
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)

Visualization of DOWN-REGULATED genes

Dot plot:

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for DOWN-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = DWR.preg.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.

UpSet plot:

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
            ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated") +
          theme(plot.title = element_text(hjust = 0.5))

Tree plot:

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for DOWN-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

UP-REGULATED

# Obtain ENTREZ ID of the genes previously extracted 
ent.Gene <- mapIds(
  org.Mm.eg.db,                         
  keys = UPR.preg.lact$EntrezGeneID, #UP-regulated pregnant-lactate     
  keytype = "ENTREZID",                  
  column = "ENTREZID",                   
  multiVals = "first"                    
)
ego <- enrichGO(gene          = na.omit(ent.Gene),
                universe      = go_gene_list,
                OrgDb         = org.Mm.eg.db,
                ont           = "all",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.10,
                qvalueCutoff  = 0.05,
                readable      = TRUE)

head(ego)# Reference
##            ONTOLOGY         ID                              Description
## GO:0006695       BP GO:0006695         cholesterol biosynthetic process
## GO:1902653       BP GO:1902653   secondary alcohol biosynthetic process
## GO:0016126       BP GO:0016126              sterol biosynthetic process
## GO:0006633       BP GO:0006633          fatty acid biosynthetic process
## GO:0046165       BP GO:0046165             alcohol biosynthetic process
## GO:0072330       BP GO:0072330 monocarboxylic acid biosynthetic process
##            GeneRatio   BgRatio RichFactor FoldEnrichment    zScore       pvalue
## GO:0006695      6/74  60/28905 0.10000000       39.06081 14.951452 1.027241e-08
## GO:1902653      6/74  60/28905 0.10000000       39.06081 14.951452 1.027241e-08
## GO:0016126      6/74  66/28905 0.09090909       35.50983 14.219674 1.841865e-08
## GO:0006633      7/74 162/28905 0.04320988       16.87813 10.267288 2.002341e-07
## GO:0046165      6/74 143/28905 0.04195804       16.38915  9.346266 1.853513e-06
## GO:0072330      7/74 228/28905 0.03070175       11.99235  8.442212 1.990582e-06
##                p.adjust       qvalue
## GO:0006695 6.980103e-06 5.401126e-06
## GO:1902653 6.980103e-06 5.401126e-06
## GO:0016126 8.343648e-06 6.456221e-06
## GO:0006633 6.802952e-05 5.264048e-05
## GO:0046165 4.508669e-04 3.488757e-04
## GO:0072330 4.508669e-04 3.488757e-04
##                                                 geneID Count
## GO:0006695           Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2     6
## GO:1902653           Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2     6
## GO:0016126           Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2     6
## GO:0006633 Gstm7/Insig1/Scd2/Elovl6/Acacb/Ptgs1/Elovl5     7
## GO:0046165           Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2     6
## GO:0072330 Gstm7/Insig1/Scd2/Elovl6/Acacb/Ptgs1/Elovl5     7
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)

Visualization of UP-REGULATED genes

Dot plot:

ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)

# Dotplot with improved readability

dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5 
  facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
  ggtitle("Dotplot for UP-regulated genes") + 
  theme(
    panel.spacing = unit(1, "lines"), # Space between facets
    axis.text.y = element_text( hjust = 1, size = 7), 
    plot.title = element_text(hjust = 0.5) # Put title in the center
  )

Gene concept network:

cnetplot(ego2, showCategory = 3, foldChange = UPR.preg.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
##  The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
##  The colorEdge parameter will be removed in the next version.

p <- upsetplot(ego2, n = 5)

p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) + 
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+

    ggtitle("Intersection of Top GO Terms for UP-regulated genes") +
    theme(plot.title = element_text(hjust = 0.5))

Tree plot

edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )

p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
##  The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for UP-regulated genes") +
  theme(
    plot.title = element_text(hjust = 0.5) 
  )